Aims

Analysis of First Targeted Sequencing Run on Sequel 1:
1. On-Target Rate
2. Raw Coverage
+ Number of Transcripts per Gene identified
3. Filtering by ToFu
4. Number of Isoforms per Gene identified

Iso-Seq

# Input IsoSeq Paths
CCS_input_dir <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/IsoSeq/CCS"
LiMA_input_dir <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/IsoSeq/LIMA"
REFINE_input_dir <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/IsoSeq/REFINE"
CLUSTER_input_dir <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/IsoSeq/CLUSTER"

## Input CCS, LIMA, REFINE files
# CCS, LIMA summary
CCS <- read.csv(paste0(CCS_input_dir, "/Batched_CCS_output.csv"), header = T)
LIMA <- read.csv(paste0(LiMA_input_dir, "/Batched_LIMA_summary.csv"), header = T)
# REFINE summary input
REFINE_json_list <- list.files(paste0(REFINE_input_dir), pattern = "flnc.filter_summary.json", full.names = T) %>% .[!grepl("Targeted_Seq",.)]
REFINE_list <- lapply(REFINE_json_list , function(x) as.data.frame(fromJSON(file = x)))
names(REFINE_list) <- list.files(paste0(REFINE_input_dir), pattern = "flnc.filter_summary.json") %>% .[!grepl("Targeted_Seq",.)]
for(i in 1:length(names(REFINE_list))){REFINE_list[[i]]$Sample <- names(REFINE_list)[[i]]}
# CLUSTER files 
# do not include batched report Targeted_Seq_1.clustered.cluster_report.csv
CLUSTER_input_files <- list.files(paste0(CLUSTER_input_dir), pattern = "clustered.cluster_report.csv", full.names = T) %>% .[!grepl("Targeted_Seq",.)]
CLUSTER_list <- lapply(CLUSTER_input_files, function(x) read.csv(x))
names(CLUSTER_list) <- list.files(paste0(CLUSTER_input_dir), pattern = "clustered.cluster_report.csv") %>% .[!grepl("Targeted_Seq",.)]
for(i in 1:length(names(CLUSTER_list))){CLUSTER_list[[i]]$Sample <- names(CLUSTER_list)[[i]]}

source("/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/Scripts/IsoSeq_Tg4510/1_Transcriptome_Annotation/Isoseq3_QC/RunStats_Function.R")
Reads <- number_of_reads()

Merged_CLUSTER <- read.csv("/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/All_Targeted_Merged/CLUSTER/All_Targeted_Merged.clustered.cluster_report.csv")

Reads <- Reads %>% mutate(Batch = word(Reads$variable, c(3), sep = fixed("_"))) %>% 
  full_join(., targetedpheno, by = c("sample" = "Sample")) %>%
  unite("Batch", Batch.x,Batch.y, na.rm = TRUE) %>%
  mutate(Batch = recode(Batch, "3b" = "3", "3a" = "3 (partial run)")) %>%
  select(Description,value,sample,Batch) %>%
  bind_rows(data.frame(Description = "Clustered reads(Transcripts)",value = as.numeric(as.character(nrow(Merged_CLUSTER))),
           sample = "Merged", Batch = "Merged"))

pIsoSeq <- ggplot(Reads, aes(x = Description, y = value, colour = Batch, group = Batch)) +
      geom_line() + geom_point() +  mytheme + theme(legend.position = "bottom") + labs(x = "", y = "Number of Reads (Thousands)") +
  scale_y_continuous(labels = unit_format(unit = "", scale = 1e-3)) +
  theme(axis.text.x = element_text(angle = 45, hjust=1))
  
pIsoSeq

On-Target Rate

Using L.Tseng’s script (https://github.com/Magdoll/cDNA_Cupcake/wiki/Targeted-Iso-Seq-QC-Wiki) to calculate the on-target rate from polished.fa file, SAM alignment, and probe BED file.

Batch 2 and Batch 3 had more samples (n = 9) than batch 6, therefore increased propensity to sequence targeted genes

Note: This is prior to Tofu-collapse, therefore some of different FL transcripts can still be derived from the same isoform

Target Genes (Merged data)

merged data clustering, cupcake collapse, cupcake filtering, sqanti filtering filter to only relevant genes, and number of isoforms

Filtering via pipeline

All_Targeted_Merged <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/All_Targeted_Merged"
Merged_PostIso <- list(read.csv(paste0(All_Targeted_Merged,"/CLUSTER/All_Targeted_Merged.clustered.cluster_report.csv")), # Cluster
                       read.table(paste0(All_Targeted_Merged,"/TOFU/All_Targeted_Merged.collapsed.abundance.txt"), header = T), # Cupcake collapse
                       read.table(paste0(All_Targeted_Merged,"/TOFU/All_Targeted_Merged.collapsed.read_stat.txt"), header = T), 
                       read.table(paste0(All_Targeted_Merged,"/TOFU/All_Targeted_Merged.collapsed.filtered.abundance.txt"), as.is = T, sep = "\t", header = T), 
                       read.table(paste0(All_Targeted_Merged,"/SQANTI/All_Targeted_Merged.collapsed.filtered_classification.filtered_lite_classification.txt"),
                                  as.is = T, sep = "\t", header = T)
                       )

names(Merged_PostIso) <- c("cluster","cupcake","cupcake_readstat","cupcake_filter","sqanti_filter")


colnames(Merged_probes_files)[6] <- "Probes"
Merged_probes_files$Gene <- word(Merged_probes_files$Probes,c(3),  sep = fixed ('_'))
Merged_probes_files$Gene <- word(Merged_probes_files$Gene,c(1),  sep = fixed ('('))
Merged_probes_files$Gene[Merged_probes_files$Gene == "27642461"] <- "MAPT"
Merged_probes_files$Gene[Merged_probes_files$Gene == "27642460"] <- "ANK1"

# Filter by targeted genes
# targeted_transcripts = transcript/x of targeted genes (differented by having overlap with probes)
# targeted_read_id = mx/x/ccs of targeted genes (origianal reads)
# targeted_pbid = pbid of targeted genes (from readstat file after filtering for targeted_read_id)

targeted <- Merged_probes_files %>% filter(Merged_probes_files$num_base_overlap != "0") %>% select(read_id,Gene) %>% rename(transcript_id = read_id) %>%
  left_join(.,Merged_PostIso$cluster, by = c("transcript_id"= "cluster_id")) %>% 
  left_join(.,Merged_PostIso$cupcake_readstat, by = c("read_id"="id") )

# Example of unmapped transcript but probed
#Merged_probes_files %>% filter(read_id == "transcript/24")
#Merged_PostIso$cluster %>% filter(cluster_id == "transcript/24")
#Merged_PostIso$cupcake_readstat %>% filter(id == "m54082_200731_163617/50594276/ccs")

tgcluster <- targeted %>% group_by(transcript_id,Gene) %>% tally() %>% # multiple same transcript_id from clustering ccs reads 
  group_by(Gene) %>% tally() %>% mutate(type = "Clustered")

# PBids = NA for non-targeted genes
tgcupcake <- Merged_PostIso$cupcake %>% left_join(., unique(targeted[,c("pbid","Gene")])) %>% group_by(Gene) %>% tally() %>% filter(!is.na(Gene)) %>% mutate(type = "Cupcake")
tgcupcakefilter <- Merged_PostIso$cupcake_filter %>% left_join(., unique(targeted[,c("pbid","Gene")], by = "pbid")) %>% group_by(Gene) %>% tally() %>% filter(!is.na(Gene))  %>% mutate(type = "Cupcake Filter")

# if use probes, also misannotation of other genes nearby "Gm49227"
#tgsqfilter <- Merged_PostIso$sqanti_filter %>% left_join(., unique(targeted[,c("pbid","Gene")]), by = c("isoform" = "pbid")) %>% group_by(Gene) %>% tally() %>% filter(!is.na(Gene)) 
tgsqfilter <- Merged_PostIso$sqanti_filter %>% filter(toupper(associated_gene) %in% targeted$Gene) %>% group_by(associated_gene) %>% tally() %>% mutate(type = "Sqanti filter") %>% mutate(Gene = toupper(associated_gene))

pNumfil <- bind_rows(tgcluster, tgcupcake, tgcupcakefilter, tgsqfilter) %>%
  ggplot(., aes(x = Gene, y = n, fill = type)) + geom_bar(stat = "identity") +
  mytheme + labs(x = "", y = "Number of Transcripts (Thousand)") + theme(legend.position = c(0.8,0.8)) + 
  scale_fill_discrete(name = "", labels = c("Iso-Seq Cluster","Cupcake Collapse","Cupcake Filter","SQANTI filter")) + 
  scale_y_continuous(labels = ks) +
  theme(axis.text.x = element_text(angle = 45, hjust=1))

pNumfil

Final dataset (SQANTI)

Batch 1

##     isoform chrom strand length exons structural_category associated_gene
## 1 PB.1357.2  chr7      -   2140     5    novel_in_catalog            Cd33
## 2 PB.1357.3  chr7      -   1510     6   full-splice_match            Cd33
##   associated_transcript ref_length ref_exons diff_to_TSS diff_to_TTS
## 1                 novel       1962         7          NA          NA
## 2  ENSMUST00000205503.1       5728         6          60        4158
##   diff_to_gene_TSS diff_to_gene_TTS      subcategory RTS_stage all_canonical
## 1              -10                2 intron_retention     FALSE     canonical
## 2              -10                0       multi-exon     FALSE     canonical
##   min_sample_cov min_cov min_cov_pos sd_cov FL n_indels n_indels_junc bite
## 1             NA      NA          NA     NA  2       NA            NA TRUE
## 2             NA      NA          NA     NA  6       NA            NA TRUE
##   iso_exp gene_exp ratio_exp FSM_class coding ORF_length CDS_length CDS_start
## 1      NA       NA        NA         C coding        149        450       326
## 2      NA       NA        NA         C coding        334       1005        18
##   CDS_end CDS_genomic_start CDS_genomic_end predicted_NMD perc_A_downstream_TTS
## 1     775          43532197        43529737         FALSE                    15
## 2    1022          43533094        43528862         FALSE                    15
##   seq_A_downstream_TTS dist_to_cage_peak within_cage_peak dist_to_polya_site
## 1 GAGGTGTGGCCTTGTTGAAT               -10             True                 NA
## 2 AGGAGGTGTGGCCTTGTTGA               -10             True                 NA
##   within_polya_site polyA_motif polyA_dist
## 1                NA        <NA>         NA
## 2                NA        <NA>         NA
##     isoform chrom strand length exons structural_category associated_gene
## 1 PB.1357.3  chr7      -   1510     6   full-splice_match            Cd33
##   associated_transcript ref_length ref_exons diff_to_TSS diff_to_TTS
## 1  ENSMUST00000205503.1       5728         6          60        4158
##   diff_to_gene_TSS diff_to_gene_TTS subcategory RTS_stage all_canonical
## 1              -10                0  multi-exon     FALSE     canonical
##   min_sample_cov min_cov min_cov_pos sd_cov FL n_indels n_indels_junc bite
## 1             NA      NA          NA     NA  6       NA            NA TRUE
##   iso_exp gene_exp ratio_exp FSM_class coding ORF_length CDS_length CDS_start
## 1      NA       NA        NA         C coding        334       1005        18
##   CDS_end CDS_genomic_start CDS_genomic_end predicted_NMD perc_A_downstream_TTS
## 1    1022          43533094        43528862         FALSE                    15
##   seq_A_downstream_TTS dist_to_cage_peak within_cage_peak dist_to_polya_site
## 1 AGGAGGTGTGGCCTTGTTGA               -10             True                 NA
##   within_polya_site polyA_motif polyA_dist
## 1                NA        <NA>         NA
##                                  id length is_fl   stat      pbid
## 1 m54082_191116_131337/47644765/ccs     NA     Y unique PB.1357.3
## 2 m54082_191116_131337/57737675/ccs     NA     Y unique PB.1357.3
## 3 m54082_191116_131337/44171880/ccs     NA     Y unique PB.1357.3
## 4 m54082_191116_131337/25101285/ccs     NA     Y unique PB.1357.3
## 5 m54082_191116_131337/31195690/ccs     NA     Y unique PB.1357.3
## 6 m54082_191116_131337/40043410/ccs     NA     Y unique PB.1357.3
##                                  id length is_fl   stat       pbid
## 1 m54082_191116_131337/47644765/ccs     NA     Y unique PB.7402.33
##                                  id length is_fl   stat       pbid
## 1 m54082_191116_131337/57737675/ccs     NA     Y unique PB.7402.33
## [1] pbid          count_fl      count_nfl     count_nfl_amb norm_fl      
## [6] norm_nfl      norm_nfl_amb 
## <0 rows> (or 0-length row.names)
##  [1] isoform               chrom                 strand               
##  [4] length                exons                 structural_category  
##  [7] associated_gene       associated_transcript ref_length           
## [10] ref_exons             diff_to_TSS           diff_to_TTS          
## [13] diff_to_gene_TSS      diff_to_gene_TTS      subcategory          
## [16] RTS_stage             all_canonical         min_sample_cov       
## [19] min_cov               min_cov_pos           sd_cov               
## [22] FL                    n_indels              n_indels_junc        
## [25] bite                  iso_exp               gene_exp             
## [28] ratio_exp             FSM_class             coding               
## [31] ORF_length            CDS_length            CDS_start            
## [34] CDS_end               CDS_genomic_start     CDS_genomic_end      
## [37] predicted_NMD         perc_A_downstream_TTS seq_A_downstream_TTS 
## [40] dist_to_cage_peak     within_cage_peak      dist_to_polya_site   
## [43] within_polya_site     polyA_motif           polyA_dist           
## [46] FL.K17                FL.K18                FL.K19               
## [49] FL.K20                FL.K21                FL.K23               
## [52] FL.K24                FL.L18                FL.L22               
## [55] FL.M21                FL.O18                FL.O22               
## [58] FL.O23                FL.P19                FL.Q17               
## [61] FL.Q18                FL.Q20                FL.Q21               
## [64] FL.Q23                FL.S18                FL.S19               
## [67] FL.S23                FL.T18                FL.T20               
## <0 rows> (or 0-length row.names)
## [1] pbid          count_fl      count_nfl     count_nfl_amb norm_fl      
## [6] norm_nfl      norm_nfl_amb 
## <0 rows> (or 0-length row.names)